In [1]:
# boilerplate includes
import sys
import os
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
#from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits.basemap import Basemap
import matplotlib.patheffects as path_effects
import pandas as pd
import seaborn as sns
import datetime
# import scipy.interpolate
# import re
from IPython.display import display, HTML
%matplotlib notebook
plt.style.use('seaborn-notebook')
pd.set_option('display.max_columns', None)
In [2]:
# ICAO callsigns of the weather stations to summarize.
# Commented-out entries were inspected and rejected for data quality.
STATIONS = [
'KLAX',
'KSFO',
'KSAN',
'KMIA',
'KFAT',
'KJAX',
'KTPA',
'KRIV',
'KIAH',
'KMCO',
'KBUR',
# 'KSNA', # Not good
# 'KONT', # Not good, 1973+ might be usable but 2001 has an 86 day and 41 day gap.
# 'KHST', # Not good
# 'KFUL', # Bad
]
# Directory holding the cleaned temperature HDF5 files and gap TSVs
TEMPERATURE_DATADIR = '../data/temperatures'
# The label of the hourly temperature column
TEMP_COL = 'AT'
# Gaps in the data this big or longer will be considered 'breaks'...
# so the 'good' continuous timeseries will start at the end of the last big gap
BIG_GAP_LENGTH = pd.Timedelta('14 days')
# # Time range to use for computing normals (30 year, just like NOAA uses)
# NORM_IN_START_DATE = '1986-07-01'
# NORM_IN_END_DATE = '2016-07-01'
# # Time range or normals to output to use when running 'medfoes on normal temperature' (2 years, avoiding leapyears)
# NORM_OUT_START_DATE = '2014-01-01'
# NORM_OUT_END_DATE = '2015-12-31 23:59:59'
In [ ]:
In [3]:
def read_isd_history_stations_list(filename, skiprows=22):
    """Parse the fixed-width station metadata table from an isd-history.txt file.

    Parameters
    ----------
    filename : str or path-like
        Path to the isd-history.txt file.
    skiprows : int
        Number of preamble lines to skip before the data rows (22 for the
        standard isd-history.txt header).

    Returns
    -------
    pandas.DataFrame
        One row per station entry; every column is kept as a raw string.
    """
    # (column name, field width) in file order; a one-character separator
    # follows each field, hence the width+1 arithmetic below.
    field_defs = (
        ('USAF', 6),
        ('WBAN', 5),
        ('STATION NAME', 28),
        ('CTRY', 4),
        ('ST', 2),
        ('CALL', 5),
        ('LAT', 7),
        ('LON', 8),
        ('EVEV', 7),  # NOTE(review): presumably a typo for 'ELEV' (elevation); kept as-is since downstream cells select this name
        ('BEGIN', 8),
        ('END', 8),
    )
    names = [name for name, _ in field_defs]
    # Build contiguous (start, stop) column spans from the cumulative widths.
    colspecs = []
    start = 0
    for _, width in field_defs:
        stop = start + width + 1
        colspecs.append((start, stop))
        start = stop
    # Keep every column as a string; callers convert types as needed.
    converters = {name: str for name in names}
    return pd.read_fwf(filename, skiprows=skiprows,
                       names=names,
                       colspecs=colspecs,
                       converters=converters)
In [4]:
# Summarize each station's usable data range: the 'good' continuous series
# starts at the end of the last gap >= BIG_GAP_LENGTH.
summary_rows = []
for callsign in STATIONS:
    # load the cleaned hourly temperature dataset
    fn = "{}_AT_cleaned.h5".format(callsign)
    ot = pd.read_hdf(os.path.join(TEMPERATURE_DATADIR, fn), 'table')
    # load the gaps information (tab-separated, '#' comment lines)
    gaps = pd.read_csv(os.path.join(TEMPERATURE_DATADIR, "{}_AT_gaps.tsv".format(callsign)),
                       sep='\t', comment='#',
                       names=['begin', 'end', 'length'],
                       parse_dates=[0, 1])
    # convert length to an actual timedelta
    gaps['length'] = pd.to_timedelta(gaps['length'])
    # make sure begin and end have the right timezone association (UTC);
    # the vectorized .dt accessor replaces the original per-row .apply()
    gaps['begin'] = gaps['begin'].dt.tz_localize('UTC')
    gaps['end'] = gaps['end'].dt.tz_localize('UTC')
    big_gaps = gaps[gaps['length'] >= BIG_GAP_LENGTH]
    end_of_last_big_gap = big_gaps['end'].max()
    if pd.isnull(end_of_last_big_gap):  # No big gaps... so use start of data
        end_of_last_big_gap = ot.index[0]
    num_gaps_after_last_big_gap = gaps[gaps['end'] > end_of_last_big_gap].shape[0]
    print(callsign,
          ot.index[0],
          ot.index[-1],
          end_of_last_big_gap,
          num_gaps_after_last_big_gap,
          sep='\t')
    summary_rows.append([
        callsign,
        ot.index[0],
        ot.index[-1],
        end_of_last_big_gap,
        num_gaps_after_last_big_gap,
    ])
df = pd.DataFrame(summary_rows,
                  columns=['callsign', 'data start', 'data end',
                           'good start', 'gaps after good start']).set_index('callsign')
In [5]:
# Show the computed start of the 'good' continuous record for each station.
df['good start']
Out[5]:
isd-history.txt metadata file
Only take rows where the callsign matches one we are interested in.
There will be multiple entries for each callsign.
Station data formats and id numbers changed over the years.
Also, sometimes stations were moved (small distances).
These changes resulted in separate entries with the same callsign.
In [6]:
# Parse the ISD station metadata table.
historydf = read_isd_history_stations_list(
os.path.join(TEMPERATURE_DATADIR,'ISD/isd-history.txt'))
# Preview: join our per-station summary with the isd-history metadata.
# Callsigns with multiple metadata entries appear on multiple rows here.
df.join(historydf.set_index('CALL'))
Out[6]:
In [7]:
# For each callsign keep only its most recent metadata entry (largest END),
# then attach the data-coverage summary computed above.
sthistdf = historydf.set_index('CALL').loc[df.index]
sthistdf = sthistdf.reset_index().sort_values(['CALL','END'], ascending=[True,False]).set_index('CALL')
# Rows are sorted END-descending, so keep='first' retains the latest entry per callsign.
foo = sthistdf[~sthistdf.index.duplicated('first')]
foo = foo.join(df)
# drop(columns=...) replaces the original drop(label, 1, inplace=True) calls:
# the positional axis argument was removed in pandas 2.0.
foo = foo.drop(columns=['END', 'BEGIN'])
foo = foo.reindex(STATIONS)  # restore the original STATIONS ordering
foo
Out[7]:
In [9]:
# Save the summary table
# (written to the notebook's working directory)
foo.to_csv('stations_summary.csv')
In [11]:
# Compact geographic overview of the stations, grouped by state and
# ordered north-to-south (descending latitude) within each state.
station_overview = foo[['STATION NAME', 'ST', 'LAT', 'LON',
                        'EVEV', 'good start', 'data end']]
station_overview = station_overview.sort_values(['ST', 'LAT'], ascending=[True, False])
display(station_overview)
In [ ]: